import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report,r2_score
from sklearn.preprocessing import StandardScaler
import pandas_profiling as pp
from pandas_profiling import*
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence every warning category for the rest of the session. NOTE: this is a
# blanket filter, so deprecation notices from the libraries imported in this
# notebook are hidden as well.
warnings.filterwarnings('ignore')
import pickle
# Load the credit-card default dataset from the Excel workbook that sits in
# the current working directory (first sheet, default parsing options).
df = pd.read_excel(io="creditcarddefault.xlsx")

# Bare expression so the notebook renders a preview of the loaded frame.
df
| | ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | default.payment.next.month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | 3913 | 3102 | 689 | 0 | 689 | 0 | 1 |
| 1 | 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 2682 | 1725 | 2682 | 0 | 1000 | 1000 | 1 |
| 2 | 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 29239 | 14027 | 13559 | 1518 | 1500 | 1000 | 0 |
| 3 | 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 46990 | 48233 | 49291 | 2000 | 2019 | 1200 | 0 |
| 4 | 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 8617 | 5670 | 35835 | 2000 | 36681 | 10000 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 29995 | 29996 | 220000 | 1 | 3 | 1 | 39 | 0 | 0 | 0 | 188948 | 192815 | 208365 | 8500 | 20000 | 5003 | 0 |
| 29996 | 29997 | 150000 | 1 | 3 | 2 | 43 | -1 | -1 | -1 | 1683 | 1828 | 3502 | 1837 | 3526 | 8998 | 0 |
| 29997 | 29998 | 30000 | 1 | 2 | 2 | 37 | 4 | 3 | 2 | 3565 | 3356 | 2758 | 0 | 0 | 22000 | 1 |
| 29998 | 29999 | 80000 | 1 | 3 | 1 | 41 | 1 | -1 | 0 | -1645 | 78379 | 76304 | 85900 | 3409 | 1178 | 1 |
| 29999 | 30000 | 50000 | 1 | 2 | 1 | 46 | 0 | 0 | 0 | 47929 | 48905 | 49764 | 2078 | 1800 | 1430 | 1 |
30000 rows × 16 columns
# Sanity-check the dimensions of the loaded frame as a (rows, columns) tuple.
df.shape
(30000, 16)
# Optional: build a pandas-profiling report for df (left disabled).
# prof=ProfileReport(df)
# prof